# Load every benchmark TSV from one-run/ into a single table, keeping the
# source file name in `path`, then reduce `path` to a short run label by
# stripping the shared prefix and suffix.
# The patterns are wrapped in fixed() so the dots are matched literally
# instead of acting as regex wildcards.
bench1 <- fs::dir_ls(path = "one-run/", glob = "*tsv") |>
  vroom(id = "path") |>
  mutate(
    path = path |>
      str_remove(fixed("one-run/train_ver9_")) |>
      str_remove(fixed("_25.07.2023-Q2-2023.5.tsv"))
  ) |>
  # Split the label into `id` and `type` at the first underscore. Labels
  # without an underscore get `type = NA`; fill = "right" makes that explicit
  # rather than relying on the default fill-with-warning behaviour.
  separate(
    path,
    into = c("id", "type"),
    sep = "_",
    extra = "merge",
    fill = "right",
    remove = FALSE
  ) |>
  # NOTE(review): assumes max_vms is reported in MB, so /1024 yields GB - confirm.
  mutate(mem_GBs = max_vms / 1024) |>
  # `qzas` is defined elsewhere; join on the trimmed label explicitly.
  left_join(qzas, by = "path")
## Rows: 8 Columns: 11
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (9): s, max_rss, max_vms, max_uss, max_pss, io_in, io_out, mean_load, c...
## time (1): h:m:s
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [1, 5].
## Joining with `by = join_by(path)`
# Shared fill palette for the bar charts.
plotcolors <- scale_fill_brewer(palette = "Set2")

# Runtime per `type`, one dodged bar per `id`; `s` is divided by 60 twice
# to express it in hours.
ggplot(bench1, aes(x = type, y = s / 60 / 60, fill = id)) +
  geom_col(position = "dodge") +
  labs(title = "Runtime in hours") +
  plotcolors
# Peak resident memory per `type`, one dodged bar per `id`.
# Uses the same /1024 conversion as `mem_GBs` and the MB/1024 plot elsewhere
# in this file, so all memory figures are on one scale.
# NOTE(review): assumes max_rss is reported in MB - confirm.
bench1 |>
  ggplot(aes(x = type, y = max_rss / 1024, fill = id)) +
  geom_col(position = "dodge") +
  labs(title = "Maximum memory usage in GBs") +
  plotcolors
Full summary on StackOverflow.
# Compare every `max*` memory column side by side for each run: reshape the
# columns to long form (metric name in `name`, value in `MB`) and draw one
# dodged bar per metric.
bench1 |>
  pivot_longer(
    cols = starts_with("max"),
    names_to = "name",
    values_to = "MB"
  ) |>
  ggplot(aes(x = paste(id, type), y = MB / 1024, fill = name)) +
  geom_col(position = "dodge")
Parameter sweep over reads-per-chunk settings for skl-classifiers.
What read numbers should we test, along a log scale?
# Candidate read counts spaced evenly on a log10 scale:
# exponents 3.0, 3.1, ..., 4.0 and the matching rounded powers of ten.
transform(
  data.frame(x = 3 + (0:10) / 10),
  y = round(10^x)
)
## x y
## 1 3.0 1000
## 2 3.1 1259
## 3 3.2 1585
## 4 3.3 1995
## 5 3.4 2512
## 6 3.5 3162
## 7 3.6 3981
## 8 3.7 5012
## 9 3.8 6310
## 10 3.9 7943
## 11 4.0 10000
# Load every benchmark TSV from batch-size/ into one table. The source file
# name is kept in `path` and reduced to the numeric value that follows
# "-chunk" in the file name.
bench2 <- fs::dir_ls(path = "batch-size/", glob = "*tsv") |>
  vroom(id = "path") |>
  mutate(
    path = path |>
      # fixed(): the dots in the prefix are literal, not regex wildcards.
      str_remove(
        fixed("batch-size/train_ver9_99_25.07.2023-Q2-2023.5-chunk")
      ) |>
      # Strip only a real trailing ".tsv" extension.
      str_remove("\\.tsv$") |>
      as.numeric(),
    # Discrete copy of the chunk value for categorical axes; `path` is
    # already numeric here, so no second conversion is needed.
    type = as.factor(path)
  )
## Rows: 16 Columns: 11
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (9): s, max_rss, max_vms, max_uss, max_pss, io_in, io_out, mean_load, c...
## time (1): h:m:s
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Shared fill palette for the bar charts.
plotcolors <- scale_fill_brewer(palette = "Set2")

# Runtime in hours against the numeric chunk value (`path`) on a log10 x axis.
ggplot(bench2, aes(x = path, y = s / 60 / 60)) +
  geom_col(position = "dodge") +
  scale_x_log10() +
  labs(title = "Runtime in hours") +
  plotcolors
# Peak resident memory for each chunk value (`type` is the factor form).
# Uses the same /1024 conversion as the other memory figures in this file.
# NOTE(review): assumes max_rss is reported in MB - confirm.
bench2 |>
  ggplot(aes(x = type, y = max_rss / 1024)) +
  geom_col(position = "dodge") +
  labs(title = "Maximum memory usage in GBs") +
  plotcolors
# Runtime (`s`) against virtual memory in GB, with a fitted regression line,
# its equation and the correlation statistic as annotations.
# Log axes can be tried by adding: scale_x_log10() + scale_y_log10()
ggscatter(bench1, x = "s", y = "mem_GBs", add = "reg.line") +
  stat_regline_equation(label.y = 120) +
  stat_cor(label.y = 110)
# Input I/O (`io_in`) against virtual memory in GB, with a fitted regression
# line, its equation and the correlation statistic as annotations.
bench1 |>
  ggscatter(x = "io_in", y = "mem_GBs", add = "reg.line") +
  # scale_x_log10() + scale_y_log10() +
  stat_regline_equation(label.x = 7777, label.y = 1.2e2) +
  # The stray empty argument (", ,") has been removed from this call.
  stat_cor(label.x = 7777, label.y = 1.1e2)
# `size` against raw `max_vms`, with a fitted regression line, its equation
# and the correlation statistic as annotations.
# NOTE(review): `size` is not created in the visible code; presumably it
# comes from the `qzas` join - confirm.
bench1 |>
  ggscatter(x = "size", y = "max_vms", add = "reg.line") +
  # scale_x_log10() + scale_y_log10() +
  stat_regline_equation(label.x = 10, label.y = 1.2e5) +
  # The stray empty argument (", ,") has been removed from this call.
  stat_cor(label.x = 10, label.y = 1.1e5)
# Output I/O (io_out / 1024, stored as `GBsOut`) against virtual memory in GB,
# with a fitted regression line, its equation and the correlation statistic.
bench1 |>
  mutate(GBsOut = io_out / 1024) |>
  ggscatter(x = "GBsOut", y = "mem_GBs", add = "reg.line") +
  # scale_x_log10() + scale_y_log10() +
  stat_regline_equation(label.x = 10, label.y = 1.2e2) +
  # The stray empty argument (", ,") has been removed from this call.
  stat_cor(label.x = 10, label.y = 1.1e2)
Ladies and gentlemen, got 'em.
# Interactive log-log view of output I/O (io_out / 1024) against virtual
# memory in GB: points coloured by `type` and shaped by `id`, with a
# translucent line connecting the points of each `type`.
ggplotly(
  ggplot(
    mutate(bench1, GBsOut = io_out / 1024),
    aes(x = GBsOut, y = mem_GBs, color = type, shape = id)
  ) +
    geom_point(size = 3) +
    geom_line(aes(group = type), alpha = 0.5, size = 2) +
    scale_x_log10() +
    scale_y_log10()
)